--- title: "Regression Project" author: "Mohammad Jamalzehi & Kosar Ghazali" date: "2023-07-14" output: word_document always_allow_html: yes ---

Introduction

# loading useful libraries
library(tidyverse)
library(tibble)     # data frame printing
library(dplyr)      # data manipulation
library(knitr)      # creating tables
library(kableExtra) # styling tables
library(caret)      # fitting KNN
library(rpart)      # fitting trees
library(rpart.plot) # plotting trees
library(ggplot2)    # plotting charts
library(plotly)     # pie chart
library(corrplot)   #correlation matrix
library(gridExtra)  #arrange multi plots
library(randomForest) #Random Forest 
library(glmnet)      #lasso Regression
library(gbm)         #Gradient Boosting Regression
library(skimr)      #summarize the data
# Load data set
library(readr)
# Read the CO2 emissions data set.
# NOTE(review): absolute Windows path — consider a relative path for portability.
data = read_csv("D:/university/Semester 6/Applied Statistics Analysis/misc/Data/CO2.csv")
# NOTE(review): attach() is discouraged (name-masking surprises); it is kept
# here because later chunks reference columns by bare backticked name.
attach(data)

Data Description

# Generate the summary of data

# Skim the data.  The result is stored as `data_summary` rather than `summary`,
# which would mask base::summary for the rest of the session.
data_summary <- skim(data)
kable(data_summary, align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
skim_type skim_variable n_missing complete_rate character.min character.max character.empty character.n_unique character.whitespace numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
character Make 0 1 3 13 0 42 0 NA NA NA NA NA NA NA NA
character Model 0 1 2 41 0 2048 0 NA NA NA NA NA NA NA NA
character Vehicle Class 0 1 7 24 0 16 0 NA NA NA NA NA NA NA NA
character Transmission 0 1 2 4 0 27 0 NA NA NA NA NA NA NA NA
character Fuel Type 0 1 1 1 0 5 0 NA NA NA NA NA NA NA NA
numeric Engine Size(L) 0 1 NA NA NA NA NA 3.160068 1.354170 0.9 2.0 3.0 3.7 8.4 ▇▇▃▂▁
numeric Cylinders 0 1 NA NA NA NA NA 5.615030 1.828307 3.0 4.0 6.0 6.0 16.0 ▇▇▁▁▁
numeric Fuel Consumption City (L/100 km) 0 1 NA NA NA NA NA 12.556534 3.500274 4.2 10.1 12.1 14.6 30.6 ▂▇▃▁▁
numeric Fuel Consumption Hwy (L/100 km) 0 1 NA NA NA NA NA 9.041706 2.224456 4.0 7.5 8.7 10.2 20.6 ▃▇▂▁▁
numeric Fuel Consumption Comb (L/100 km) 0 1 NA NA NA NA NA 10.975071 2.892506 4.1 8.9 10.6 12.6 26.1 ▃▇▂▁▁
numeric Fuel Consumption Comb (mpg) 0 1 NA NA NA NA NA 27.481652 7.231879 11.0 22.0 27.0 32.0 69.0 ▃▇▂▁▁
numeric CO2 Emissions(g/km) 0 1 NA NA NA NA NA 250.584699 58.512679 96.0 208.0 246.0 288.0 522.0 ▂▇▅▁▁
# Count missing values across the entire data set (0 means fully complete)
na_total <- sum(is.na(data))
na_total
## [1] 0

Data visualization

# Cylinders in vehicles.
# Cylinder counts are integers, so an explicit binwidth of 1 gives one bar per
# count and silences the "stat_bin() using bins = 30" message.
ggplot(data) +
  geom_histogram(aes(x = Cylinders), binwidth = 1,
                 fill = "steelblue", color = "white", alpha = 0.6) +
  labs(title = "Distribution of Cylinders in Vehicles",
       x = "Cylinders",
       y = "Count") +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Overlay the city and highway fuel-consumption distributions.
# Fixes to the original call:
#  * `data` is passed explicitly instead of relying on attach()
#  * `density = TRUE` is not a geom_histogram() argument; density scaling is
#    requested via aes(y = after_stat(density))
#  * both `bins = 20` and `binwidth = 1` were supplied (binwidth wins) — only
#    binwidth is kept
#  * scale_fill_manual() had no effect because fill was a constant, not a
#    mapped aesthetic, so it is dropped
ggplot(data) +
  geom_histogram(aes(x = `Fuel Consumption City (L/100 km)`, y = after_stat(density)),
                 binwidth = 1, fill = "orange", alpha = 0.5) +
  geom_histogram(aes(x = `Fuel Consumption Hwy (L/100 km)`, y = after_stat(density)),
                 binwidth = 1, fill = "blue", alpha = 0.5) +
  labs(title = "Fuel Consumption", x = "Fuel Consumption", y = "Density") +
  theme_minimal()

# Fuel types used on highways vs in the city, drawn side by side.

hwy_plot <- ggplot(data, aes(x = `Fuel Consumption Hwy (L/100 km)`, fill = `Fuel Type`)) +
  geom_histogram(binwidth = 0.5, color = "black", alpha = 0.6) +
  labs(title = "Fuel types used in Highways", x = "Fuel Consumption Hwy", y = "Count") +
  theme_minimal() +
  theme(legend.position = "none")  # legend appears once, on the city panel
city_plot <- ggplot(data, aes(x = `Fuel Consumption City (L/100 km)`, fill = `Fuel Type`)) +
  geom_histogram(binwidth = 0.5, color = "black", alpha = 0.6) +
  labs(title = "Fuel types used in City", x = "Fuel Consumption City", y = "Count") +
  theme_minimal()
grid.arrange(hwy_plot, city_plot, ncol = 2)

Fuel types Z and X, which are "premium gasoline" and "regular gasoline", are dominant both on highways and in the city.

# Median and IQR for various types of vehicles

# Define the color palette (interpolated to one color per manufacturer)
my_palette <- colorRampPalette(c("orange", "yellow", "limegreen", "steelblue", "pink"))

# Boxplot of CO2 emissions per manufacturer.
# Fix: guides(fill = FALSE) is deprecated since ggplot2 3.3.4 — use "none".
ggplot(data) +
  geom_boxplot(aes(x = Make, y = `CO2 Emissions(g/km)`, fill = Make)) +
  labs(title = "Median and IQR for Various Types of Vehicles",
       x = "Make",
       y = "CO2 Emissions (g/km)") +
  scale_fill_manual(values = my_palette(length(unique(data$Make)))) +
  theme_minimal() +
  guides(fill = "none") +
  theme(plot.title = element_text(size = 16, face = "bold"),
        axis.title = element_text(size = 12),
        axis.text.x = element_text(angle = 45, hjust = 1))

We can see that all models of Rolls-Royce have emissions above 350. Some models (outliers) from Mercedes-Benz and Ford compete with the maximum emissions; however, there is no solid evidence that the manufacturer of a vehicle is necessarily correlated with its CO2 emissions.

# Frequency table of transmissions, reshaped into a data frame for plotly.
transmission_counts <- table(data$Transmission)
transmission_distr <- data.frame(
  index = names(transmission_counts),
  Transmission = as.vector(transmission_counts)
)

# Pie chart of the transmission distribution
plot_ly(transmission_distr, values = ~Transmission, labels = ~index, type = "pie") %>%
  layout(title = "Transmission Distribution")
# City fuel consumption vs CO2 emissions, with a dashed linear trend line.
city_vs_co2 <- ggplot(data, aes(x = `Fuel Consumption City (L/100 km)`,
                                y = `CO2 Emissions(g/km)`))
city_vs_co2 +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linetype = "dashed") +
  labs(title = "Fuel Consumption City vs CO2 Emissions",
       x = "Fuel Consumption City (L/100 km)",
       y = "CO2 Emissions (g/km)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Highway fuel consumption vs CO2 emissions, with a dashed linear trend line.
hwy_vs_co2 <- ggplot(data, aes(x = `Fuel Consumption Hwy (L/100 km)`,
                               y = `CO2 Emissions(g/km)`))
hwy_vs_co2 +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linetype = "dashed") +
  labs(title = "Fuel Consumption Hwy vs CO2 Emissions",
       x = "Fuel Consumption Hwy (L/100 km)",
       y = "CO2 Emissions (g/km)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Combined fuel economy (mpg) vs CO2 emissions, with a linear trend line.
# Fix: the title was missing the space in "(mpg) vs".
ggplot(data, aes(x = `Fuel Consumption Comb (mpg)`, y = `CO2 Emissions(g/km)`)) +
  geom_point(color = "steelblue", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linetype = "dashed") +
  labs(title = "Fuel Consumption Comb (mpg) vs CO2 Emissions",
       x = "Fuel Consumption Comb (mpg)",
       y = "CO2 Emissions (g/km)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

There are two distinct distributions visible in these plots. A possible reason is the type of fuel that has been consumed.

# Boxplot of combined fuel economy (mpg) by fuel type.
# Fixes: the title and y label described CO2 emissions while the y aesthetic
# is Fuel Consumption Comb (mpg); guides(fill = FALSE) is deprecated.
ggplot(data) +
  geom_boxplot(aes(x = `Fuel Type`, y = `Fuel Consumption Comb (mpg)`, fill = `Fuel Type`)) +
  labs(title = "Median and IQR for Various Fuel Types",
       x = "Fuel Type",
       y = "Fuel Consumption Comb (mpg)") +
  theme_minimal() +
  guides(fill = "none") +
  theme(plot.title = element_text(size = 16, face = "bold"),
        axis.title = element_text(size = 12))

We can see that fuel type has a strong relationship with fuel consumption.

E = Ethanol (E85) produces less CO2 than the other fuels,

so it is important that we include fuel type in the regression models.

# Boxplot of combined fuel consumption per vehicle class.
# Fix: `fill` was passed to ggplot() outside aes(), so it was silently
# ignored; it is now mapped inside aes() as intended (the legend stays hidden
# via guides(fill = "none"), the modern spelling of guides(fill = FALSE)).
ggplot(data, aes(x = `Vehicle Class`, y = `Fuel Consumption Comb (L/100 km)`,
                 fill = `Vehicle Class`)) +
  geom_boxplot() +
  labs(title = "Median and IQR for Various Types of Vehicle Classes",
       x = "Vehicle Class",
       y = "Fuel Consumption") +
  theme_minimal() +
  guides(fill = "none") +
  theme(plot.title = element_text(size = 16, face = "bold"),
        axis.title = element_text(size = 12),
        axis.text.x = element_text(angle = 45, hjust = 1))

As you can see in this chart, small cars consume less fuel than bigger vehicles, so vehicle class does contribute to CO2 emissions.

# Boxplot of combined fuel consumption by transmission type.
# Fix: guides(fill = FALSE) is deprecated since ggplot2 3.3.4 — use "none".
ggplot(data) +
  geom_boxplot(aes(x = Transmission, y = `Fuel Consumption Comb (L/100 km)`, fill = Transmission)) +
  labs(title = "Median and IQR for Various Types of Transmission",
       x = "Transmission",
       y = "Fuel Consumption Comb") +
  theme_minimal() +
  guides(fill = "none") +
  theme(plot.title = element_text(size = 16, face = "bold"),
        axis.title = element_text(size = 12),
        axis.text.x = element_text(angle = 45, hjust = 1))

# Numeric-only view of the data (drop the categorical columns)
numeric_data <- subset(data, select = -c(Make, `Vehicle Class`, Model, Transmission, `Fuel Type`))

# Diverging red-white-blue palette for the correlation plot
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

# Correlation matrix
cor_matrix <- cor(numeric_data)
# Upper triangle only, hierarchically clustered, coefficients printed in gray
corrplot(cor_matrix,
         method = "circle", type = "upper", order = "hclust",
         tl.cex = 0.8, tl.col = "black", tl.srt = 45, tl.offset = 0.5,
         col = col(200), addCoef.col = "gray", number.digits = 2,
         mar = c(0, 0, 1, 0), diag = FALSE,
         cl.pos = "n", cl.align = "b",
         addrect = 2, rect.col = "lightgray", rect.lwd = 0.5)

Pre Processing

As you can see, there is only one vehicle in the data set that uses fuel type N. We can safely omit this vehicle, since a single observation contributes almost nothing to modeling the response variable.

# Drop the single fuel-type-N vehicle (one observation cannot inform the fit).
data <- data[data$`Fuel Type` != "N", ]
# Augment the data with a "car size" proxy: log of engine size x cylinders x
# combined consumption.  Fix: computed via with() on the filtered data instead
# of re-attach()ing it, which would mask the copy attached earlier.
data$car_size <- with(data, log(`Engine Size(L)` * Cylinders * `Fuel Consumption Comb (L/100 km)`))

here we see the relation between car size and the CO2 emission

# Relationship between the car-size proxy and combined fuel consumption.
size_vs_fuel <- ggplot(data, aes(x = car_size, y = `Fuel Consumption Comb (L/100 km)`))
size_vs_fuel +
  geom_point(color = "steelblue", size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linetype = "dashed") +
  labs(x = "Car Size", y = "Fuel Consumption Comb (L/100 km)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Relationship between the car-size proxy and CO2 emissions.
size_vs_co2 <- ggplot(data, aes(x = car_size, y = `CO2 Emissions(g/km)`))
size_vs_co2 +
  geom_point(color = "steelblue", size = 3) +
  geom_smooth(method = "lm", se = FALSE, color = "darkred", linetype = "dashed") +
  labs(x = "Car Size", y = "CO2 Emissions(g/km)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Now we know that car size is highly correlated with both fuel consumption and CO2; next we want to see how fuel type and car size are related.

# Boxplot of the car-size proxy by fuel type.
# Fix: guides(fill = FALSE) is deprecated since ggplot2 3.3.4 — use "none".
ggplot(data) +
  geom_boxplot(aes(x = `Fuel Type`, y = car_size, fill = `Fuel Type`)) +
  labs(title = "Median and IQR for Various Size of Vehicles",
       x = "Fuel Type",
       y = "Size of vehicles") +
  theme_minimal() +
  guides(fill = "none") +
  theme(plot.title = element_text(size = 16, face = "bold"),
        axis.title = element_text(size = 12))

Here we can create a new correlation matrix that includes the car-size variable.

# Numeric-only view again, now including the car_size column
numeric_data_2 <- subset(data, select = -c(Make, `Vehicle Class`, Model, Transmission, `Fuel Type`))

# Diverging red-white-blue palette for the correlation plot
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

# Correlation matrix
cor_matrix <- cor(numeric_data_2)
# Same corrplot layout as before: clustered upper triangle with gray coefficients
corrplot(cor_matrix,
         method = "circle", type = "upper", order = "hclust",
         tl.cex = 0.8, tl.col = "black", tl.srt = 45, tl.offset = 0.5,
         col = col(200), addCoef.col = "gray", number.digits = 2,
         mar = c(0, 0, 1, 0), diag = FALSE,
         cl.pos = "n", cl.align = "b",
         addrect = 2, rect.col = "lightgray", rect.lwd = 0.5)

Converting variables to numeric.

# One-hot encode the categorical predictors listed in cat_cols.
# Fixes: seq_along() instead of 1:length() (safe if the vector is empty) and
# loop-local names that no longer shadow the `col` palette function.
numeric_data_3 <- data
cat_cols <- c("Fuel Type")  # categorical columns to encode
prefixes <- c("F")          # prefix applied to each group of dummy columns

for (i in seq_along(cat_cols)) {
  prefix <- prefixes[i]
  col_name <- cat_cols[i]

  # model.matrix(~ . - 1) builds one indicator column per level
  dummies <- model.matrix(~ . - 1, data = numeric_data_3[col_name])
  colnames(dummies) <- paste0(prefix, "_", colnames(dummies))

  numeric_data_3 <- cbind(numeric_data_3, dummies)
}

# Response vector and the modeling frame without the raw categorical columns
labels <- numeric_data_3$`CO2 Emissions(g/km)`
df <- subset(numeric_data_3, select = -c(Make, `Vehicle Class`, Model, Transmission, `Fuel Type`))

As you can see, car size has the highest correlation with the response variable.

Renaming columns

# Rename columns positionally in one vectorized assignment (same indices and
# same resulting names as renaming them one at a time).
colnames(df)[c(1, 3:7, 9:12)] <- c(
  "engin_size",
  "Fuel_cons_city", "Fuel_cons_hwy", "Fuel_cons_comb", "Fuel_cons_comb_mpg",
  "CO2",
  "FUEL_D", "FUEL_E", "FUEL_X", "FUEL_Z"
)

Normalization

# Standardize every column to z-scores, keeping the data-frame shape
df_std <- as.data.frame(scale(df))

Sorting the Data

# Shift the response variable to the last column of a data frame
response <- "CO2"

# Helper: move the named response column to the end, preserving all others.
move_response_last <- function(d, resp) {
  idx <- which(names(d) == resp)
  d[, c(setdiff(seq_len(ncol(d)), idx), idx)]
}

df <- move_response_last(df, response)
# Same rearrangement for the standardized copy
df_std <- move_response_last(df_std, response)

Data Splitting

# Train-test split (80/20); seed fixed for reproducibility
set.seed(12)
data_idx_train <- sample(nrow(df), size = 0.8 * nrow(df))
train_set <- df[data_idx_train, ]
test_set <- df[-data_idx_train, ]
# Train-test split on the standardized copy (same seed -> same rows)
set.seed(12)
data_idx_train_std <- sample(nrow(df_std), size = 0.8 * nrow(df_std))
train_set_std <- df_std[data_idx_train_std, ]
test_set_std <- df_std[-data_idx_train_std, ]
# Estimation-validation split (80/20 of the training rows)
data_idx_est <- sample(nrow(train_set), size = 0.8 * nrow(train_set))
estimation <- train_set[data_idx_est, ]
validation <- train_set[-data_idx_est, ]

# Estimation-validation split (standardized copy)
data_idx_est_std <- sample(nrow(train_set_std), size = 0.8 * nrow(train_set_std))
estimation_std <- train_set_std[data_idx_est_std, ]
validation_std <- train_set_std[-data_idx_est_std, ]

# Fix: the original attach(estimation) was removed — every model call below
# passes its data explicitly, and attach() is a well-known masking footgun.

Normalization

Regression Analysis

1- Linear Regression

# R-squared and RMSE Calculator Functions

# Root-mean-squared error between observed and predicted values.
calc_RMSE <- function(actual, predicted) {
  errors <- actual - predicted
  sqrt(mean(errors^2))
}

# Coefficient of determination: 1 - SSE/SST, with SST taken about the mean of
# the observed values.
calc_r2 <- function(actual, predicted) {
  residual_ss <- sum((actual - predicted)^2)
  total_ss <- sum((actual - mean(actual))^2)
  1 - residual_ss / total_ss
}

Estimation

# Candidate linear models: single predictor, all main effects, and a
# stepwise-selected model over all pairwise interactions.
linear_models <- list(
  model_1 = lm(CO2 ~ car_size, data = estimation),
  model_2 = lm(CO2 ~ ., data = estimation),
  model_3 = step(lm(CO2 ~ .^2, data = estimation), trace = FALSE)
)

Validation

# Predict on the validation set with every candidate linear model
linear_pred <- lapply(linear_models, predict, validation)

# Score each model (vapply pins the return type to one double per model)
rmse_values_linear <- vapply(linear_pred, calc_RMSE, numeric(1), actual = validation$CO2)
r2_values_linear <- vapply(linear_pred, calc_r2, numeric(1), actual = validation$CO2)

table_data_linear <- tibble(
  Model = names(linear_pred),
  RMSE = rmse_values_linear,
  R2 = r2_values_linear
)

print(table_data_linear)
## # A tibble: 3 × 3
##   Model    RMSE    R2
##   <chr>   <dbl> <dbl>
## 1 model_1 23.2  0.832
## 2 model_2  5.04 0.992
## 3 model_3  2.75 0.998
# Refit the winning specification (identical to model_3 above) and record its
# validation metrics.
best_linear <- step(lm(CO2 ~ .^2, data = estimation), trace = FALSE)
best_linear_r2 <- 0.998
best_linear_RMSE <- 2.753

2- KNN regression

Estimation

# Fit KNN regressors for k = 1..25 on the standardized estimation set.
# Fixes: attach(estimation_std) removed (knnreg receives its data explicitly)
# and lapply replaces growing the list element-by-element in a loop.
KNN_models <- lapply(1:25, function(k) {
  knnreg(CO2 ~ ., data = estimation_std, k = k)
})

Validation

# Validation predictions for each k
knn_pred <- lapply(KNN_models, predict, validation_std)
# Fix: the truth vector is passed by name.  Positionally it landed in
# `actual` while the predictions filled `predicted`, so calc_r2 computed SST
# about the mean of the *predictions* — the wrong baseline.
rmse_values_knn <- sapply(knn_pred, calc_RMSE, actual = validation_std$CO2)
r2_values_knn <- sapply(knn_pred, calc_r2, actual = validation_std$CO2)

# Fix: the original included Model = names(knn_pred), but a list built by
# index has NULL names, so tibble() silently dropped that column — omitted.
table_data_knn <- tibble(
  k_value = 1:25,
  RMSE = rmse_values_knn,
  R2 = r2_values_knn
)
table_data_knn
## # A tibble: 25 × 3
##    k_value   RMSE    R2
##      <int>  <dbl> <dbl>
##  1       1 0.0542 0.997
##  2       2 0.0554 0.997
##  3       3 0.0593 0.996
##  4       4 0.0592 0.996
##  5       5 0.0616 0.996
##  6       6 0.0649 0.996
##  7       7 0.0672 0.995
##  8       8 0.0690 0.995
##  9       9 0.0726 0.995
## 10      10 0.0741 0.994
## # ℹ 15 more rows
# Line plot of validation R2 against k (comment corrected: this plots R2)
ggplot(table_data_knn, aes(x = k_value, y = R2)) +
  geom_line(color = "darkred", linewidth = 1) +
  labs(title = "R2 Values for KNN Models",
       x = "K Value",
       y = "R2") +
  theme_minimal()

# k = 1 gave the best validation R2.
# NOTE(review): k = 1 winning often signals near-duplicate rows between the
# estimation and validation sets — worth checking.
best_knn <- knnreg(CO2 ~ ., data = estimation_std, k = 1)
best_knn_r2 <- 0.9971
best_knn_RMSE <- 0.0539683

3- Regression Tree

Estimation

# Full regression tree for inspection.
# Fix: attach(estimation) removed — rpart() receives its data explicitly.
reg_tree_all <- rpart(CO2 ~ ., data = estimation)
rpart.plot(reg_tree_all)

# Candidate trees over a small grid of complexity (cp) / minsplit values
tree_models <- list(
  tree_1 = rpart(CO2 ~ ., data = estimation, cp = 0.005, minsplit = 5),
  tree_2 = rpart(CO2 ~ ., data = estimation, cp = 0.01, minsplit = 5),
  tree_3 = rpart(CO2 ~ ., data = estimation, cp = 0.015, minsplit = 5),
  tree_4 = rpart(CO2 ~ ., data = estimation, cp = 0.02, minsplit = 5),
  tree_5 = rpart(CO2 ~ ., data = estimation, cp = 0.018, minsplit = 20)
)
# Variable importance from the full tree
importances <- reg_tree_all$variable.importance

# Data frame of variable names with their importances, sorted descending
df_VI <- data.frame(Variable = names(importances), Importance = importances)
df_VI <- df_VI[order(df_VI$Importance, decreasing = TRUE), ]

# Display as a tibble
as_tibble(df_VI)
## # A tibble: 9 × 2
##   Variable           Importance
##   <chr>                   <dbl>
## 1 Fuel_cons_comb      14139146.
## 2 Fuel_cons_comb_mpg  13846437.
## 3 Fuel_cons_city      13257462.
## 4 car_size            12162866.
## 5 Fuel_cons_hwy       10961811.
## 6 engin_size           9391369.
## 7 Cylinders            1468663.
## 8 FUEL_E                425124.
## 9 FUEL_X                 44553.

Validation

# Validation predictions for every candidate tree.
# Fixes: attach(validation) removed (data is passed explicitly), and the truth
# vector is passed to calc_RMSE / calc_r2 by name — positionally the
# predictions landed in `actual`, so calc_r2 used the wrong SST baseline.
tree_pred <- lapply(tree_models, predict, validation)
rmse_values_tree <- sapply(tree_pred, calc_RMSE, actual = validation$CO2)
r2_values_tree <- sapply(tree_pred, calc_r2, actual = validation$CO2)

table_data_tree <- tibble(
  Model = names(tree_pred),
  RMSE = rmse_values_tree,
  R2 = r2_values_tree
)
table_data_tree
## # A tibble: 5 × 3
##   Model   RMSE    R2
##   <chr>  <dbl> <dbl>
## 1 tree_1  13.2 0.946
## 2 tree_2  16.0 0.920
## 3 tree_3  21.1 0.852
## 4 tree_4  22.5 0.827
## 5 tree_5  21.1 0.852
# Winning tree (same specification as tree_1) and its validation metrics
best_tree <- rpart(CO2 ~ ., data = estimation, cp = 0.005, minsplit = 5)
best_tree_r2 <- 0.9458
best_tree_rmse <- 13.95340

4- Random Forest Regression

Estimation

# Random-forest tuning sweep over ntree = 1..100.
# NOTE(review): the sweep is commented out (presumably because it is slow);
# the ntree = 16 used for best_rf below appears to come from a previous run
# of this loop — confirm.  attach(estimation) is redundant here since
# randomForest() is given data = estimation explicitly.
attach(estimation)
RF_models = list()
set.seed(12)
#for (i in 1:100) {
  #RF_models[[i]] = randomForest(CO2 ~ ., data = estimation, 
                         #ntree = i, mtry = sqrt(ncol(estimation)-1))
#}

Validation

# NOTE(review): attach(validation) is redundant — the prediction code below
# (commented out) passes newdata = validation explicitly — and attach() is
# discouraged.
attach(validation)
# Validation scoring for the (commented-out) ntree sweep above.
#RF_pred <- predict(RF_models, newdata = validation)

#rmse_values_RF <- sapply(RF_pred, calc_RMSE,validation$CO2)
#r2_values_RF <- sapply(RF_pred, calc_r2, validation$CO2)

#table_data_RF <- tibble(
  #k_value = 1:100,
 # Model = names(RF_pred),
 # RMSE = rmse_values_RF,
 # R2 = r2_values_RF
#)
#which.max(table_data_RF$R2)
#table_data_RF
# Best forest from the sweep: 16 trees, mtry = sqrt(number of predictors).
best_rf = randomForest(CO2 ~ ., data = estimation,ntree = 16, mtry = sqrt(ncol(estimation)-1))
best_rf_r2 = 0.9966076  
best_rf_rmse = 3.273711

5- Lasso Regression

Estimation

# Separate predictors (X) and response variable (Y) for glmnet.
# Fix: select the response by name instead of the magic column index 12,
# which would break silently if the column order ever changed.
X <- as.matrix(estimation[, names(estimation) != "CO2"])
Y <- as.matrix(estimation[, "CO2"])
# Fit the lasso path (alpha = 1 selects the lasso penalty)
lasso_model <- glmnet(X, Y, alpha = 1)

Cross Validation

# Perform cross-validation to select optimal lambda (penalty parameter)
cross_validation <- cv.glmnet(X, Y, alpha = 1)
plot(cross_validation)

# Extract the optimal lambda based on minimum mean cross-validated error
optimal_lambda <- cross_validation$lambda.min
cat("Optimal Lambda:", optimal_lambda)
## Optimal Lambda: 0.1727899
# Refit at the chosen lambda.  Fix: the original passed `aplha = 1` (typo),
# which glmnet swallowed, silently falling back to the default alpha = 1 —
# harmless here only by coincidence.
optimal_lasso <- glmnet(X, Y, alpha = 1, lambda = optimal_lambda)

Validation

# Lasso prediction on the validation set (CO2 is column 12, dropped from X)
X_validation <- as.matrix(validation[, -12])
lasso_pred <- predict(optimal_lasso, newx = X_validation)

# Score against the observed CO2 values
rmse_values_lasso <- calc_RMSE(validation$CO2, lasso_pred)
r2_values_lasso <- calc_r2(validation$CO2, lasso_pred)
cat("R-squared value for Lasso regression:", r2_values_lasso)
## R-squared value for Lasso regression: 0.9918463
cat("\nRMSE value for Lasso regression:", rmse_values_lasso)
## 
## RMSE value for Lasso regression: 5.111651

6- Ridge Regression

Estimation

# Fit the ridge path (alpha = 0 selects the ridge penalty)
ridge_model <- glmnet(X, Y, alpha = 0)

Cross validation

# Perform cross-validation to select optimal lambda (penalty parameter)
cross_validation_ridge <- cv.glmnet(X, Y, alpha = 0)
plot(cross_validation_ridge)

# Extract the optimal lambda based on minimum mean cross-validated error
optimal_lambda_ridge <- cross_validation_ridge$lambda.min
cat("Optimal Lambda:", optimal_lambda_ridge)
## Optimal Lambda: 5.528011
# Refit at the chosen lambda.  Bug fix: the original passed `aplha = 0`
# (typo), so glmnet fell back to its default alpha = 1 and actually fit a
# *lasso* at the ridge lambda.  alpha = 0 fits the intended ridge model.
optimal_ridge <- glmnet(X, Y, alpha = 0, lambda = optimal_lambda_ridge)

Validation

# Ridge prediction on the validation set.
# Fix: the matrix defined here (X_validation_ridge) is now actually used; the
# original built it, then predicted with the lasso section's X_validation
# (same contents, but confusing and fragile).
X_validation_ridge <- as.matrix(validation[, -12])
ridge_pred <- predict(optimal_ridge, newx = X_validation_ridge)
rmse_values_ridge <- calc_RMSE(validation$CO2, ridge_pred)
r2_values_ridge <- calc_r2(validation$CO2, ridge_pred)
cat("R-squared value for Ridge regression:", r2_values_ridge)
## R-squared value for Ridge regression: 0.9520955
cat("\nRMSE value for Ridge regression:", rmse_values_ridge)
## 
## RMSE value for Ridge regression: 12.39002

7- Elastic Net Regression

Estimation

# Fit the elastic-net path (alpha = 0.5 balances the lasso and ridge penalties)
enet_model <- glmnet(X, Y, alpha = 0.5)

Cross Validation

# Perform cross-validation to select optimal lambda (penalty parameter)
cross_validation_enet <- cv.glmnet(X, Y, alpha = 0.5)
plot(cross_validation_enet)

# Extract the optimal lambda based on minimum mean cross-validated error
optimal_lambda_enet <- cross_validation_enet$lambda.min
cat("Optimal Lambda:", optimal_lambda_enet)
## Optimal Lambda: 0.1977535
# Refit at the chosen lambda.  Bug fix: the original passed `aplha = 0.5`
# (typo), so glmnet fell back to its default alpha = 1 and fit a lasso, not
# the intended elastic net.  (Comment corrected: this is the elastic-net
# model, not ridge.)
optimal_enet <- glmnet(X, Y, alpha = 0.5, lambda = optimal_lambda_enet)

Validation

# Elastic-net prediction on the validation set.
# Fixes: the header and cat() labels said "Ridge" (copy-paste leftovers), and
# the matrix defined here (X_validation_enet) is now actually used instead of
# the lasso section's X_validation.
X_validation_enet <- as.matrix(validation[, -12])
enet_pred <- predict(optimal_enet, newx = X_validation_enet)
rmse_values_enet <- calc_RMSE(validation$CO2, enet_pred)
r2_values_enet <- calc_r2(validation$CO2, enet_pred)
cat("R-squared value for Elastic Net regression:", r2_values_enet)
## R-squared value for Elastic Net regression: 0.9918253
cat("\nRMSE value for Elastic Net regression:", rmse_values_enet)
## 
## RMSE value for Ridge regression: 5.118218

8 - Gradient Boosting Regression

Estimation

# Gradient-boosting model: 100 gaussian trees, depth 4, shrinkage 0.01
gbm_model <- gbm(
  CO2 ~ .,
  data = estimation,
  distribution = "gaussian",
  n.trees = 100,
  interaction.depth = 4,
  shrinkage = 0.01
)

Validation

# Predict with all 100 trees and score on the validation set
gbm_pred <- predict(gbm_model, newdata = validation, n.trees = 100)
rmse_values_gbm <- calc_RMSE(validation$CO2, gbm_pred)
r2_values_gbm <- calc_r2(validation$CO2, gbm_pred)
cat("R-squared value for GBM regression:", r2_values_gbm)
## R-squared value for GBM regression: 0.7963832
cat("\nRMSE value for GBM regression:", rmse_values_gbm)
## 
## RMSE value for GBM regression: 25.54412

Best Model

# Load required libraries
# Fix: the original line had "library(knitr)" fused onto the end of this
# comment, so the call was commented out; it is restored below.
library(knitr)
library(kableExtra)

# Create a data frame with the models, R-squared, and RMSE metrics
models <- c("Linear", "KNN", "Tree", "RandomForest", "Lasso", "Ridge", "Elastic Net", "Gradient Boosting")
r_squared <- c(best_linear_r2, best_knn_r2, best_tree_r2, best_rf_r2, r2_values_lasso, r2_values_ridge, r2_values_enet, r2_values_gbm)
rmse <- c(best_linear_RMSE, best_knn_RMSE, best_tree_rmse, best_rf_rmse, rmse_values_lasso, rmse_values_ridge, rmse_values_enet, rmse_values_gbm)

all_models <- tibble(
  Model = models,
  RMSE = rmse,
  R2 = r_squared
)
# Sort the models based on R2 in descending order
sorted_models <- all_models[order(all_models$R2, decreasing = TRUE), ]

# Render the sorted comparison table
Final <- kable(sorted_models, "html", align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  add_header_above(c("Regression Models" = 3)) %>%
  add_footnote("Note: R-squared and RMSE are evaluation metrics for the regression models.")
Final
Regression Models
Model RMSE R2
Linear 2.7530000 0.9980000
KNN 0.0539683 0.9971000
RandomForest 3.2737110 0.9966076
Lasso 5.1116510 0.9918463
Elastic Net 5.1182180 0.9918253
Ridge 12.3900231 0.9520955
Tree 13.9534000 0.9458000
Gradient Boosting 25.5441164 0.7963832
a Note: R-squared and RMSE are evaluation metrics for the regression models.